melb_df <- read_csv(here("data", "melb_temp.csv")) %>%
clean_names() %>%
rename(temp = maximum_temperature_degree_c) %>%
filter(!is.na(temp)) %>%
select(year, month, day, temp) %>%
mutate(date = as.Date(paste(year, month, day, sep = "-")),
# so there is a common scale for x axis
# year 2000 is a dummy
dummy_mday = ymd(paste0(2000, month, day, sep = "-")))
There are a number of graphics that you could chose for comparison which focus on particular features to compare. E.g.
winter_df <- melb_df %>%
filter(month %in% c("06", "07", "08"))
ggplot(winter_df, aes(dummy_mday, temp)) +
geom_point(data = select(winter_df, -year),
color = "gray", size = 0.1) +
geom_line() +
facet_wrap(~year) +
labs(tag = "(A)", x = "Month",
y = "Temperature (°C)")
ggplot(winter_df, aes(temp, as.factor(year))) +
geom_boxplot() +
labs(tag = "(B)", y = "Year",
x = "Temperature (°C)")
winter_df %>%
mutate(year = fct_reorder(as.factor(year), temp)) %>%
ggplot(aes(temp, year)) +
geom_boxplot(aes(color = as.numeric(as.character(year)))) +
labs(tag = "(C)", y = "Year",
x = "Temperature (°C)",
color = "Year") +
scale_color_continuous_divergingx(mid = 1995)
winter_df %>%
mutate(pre1995 = ifelse(year < 1995, "Pre-1995", "Post-1995")) %>%
ggplot(aes(pre1995, temp)) +
geom_boxplot() +
labs(tag = "(D)", y = "Temperature (°C)",
x = "Time Period") +
scale_color_continuous_divergingx(mid = 1995)
df <- tribble(~year, ~offense, ~victim,
2000, "Anti-Black", 3535,
2000, "Sexual Orientation", 1558,
2000, "Anti-Islamic", 36,
2001, "Anti-Black", 3700,
2001, "Sexual Orientation", 1664,
2001, "Anti-Islamic", 554,
2002, "Anti-Black", 3076,
2002, "Sexual Orientation", 1513,
2002, "Anti-Islamic", 174) %>%
mutate(offense = fct_reorder(offense, -victim))
pop_df <- tribble(~pop, ~size,
"Anti-Black", 36.4e6,
"Sexual Orientation", 28.2e6,
"Anti-Islamic", 3.4e6)
crime_df <- left_join(df, pop_df, by = c("offense" = "pop")) %>%
mutate(prop = victim / size)
ggplot(crime_df, aes(as.factor(year), victim, color = offense)) +
geom_point() +
geom_line(aes(group = offense)) +
scale_color_discrete_qualitative() +
labs(x = "Year", y = "The number of victims",
color = "Offense", tag = "(A)")
ggplot(crime_df, aes(as.factor(year), prop * 10000, color = offense)) +
geom_point() +
geom_line(aes(group = offense)) +
scale_color_discrete_qualitative() +
labs(x = "Year", y = "Incidence estimate per 10,000 people",
color = "Offense", tag = "(B)")
year2000dict <- crime_df %>%
filter(year == 2000) %>%
select(offense, prop) %>%
deframe()
crime_df %>%
mutate(rel2000 = prop / year2000dict[offense]) %>%
filter(year != 2000) %>%
ggplot(aes(as.factor(year), rel2000, color = offense)) +
geom_point() +
geom_line(aes(group = offense)) +
scale_color_discrete_qualitative() +
scale_y_continuous(breaks = c(1, 4, 5, 15, 16)) +
labs(x = "Year", y = "Odds ratio with respect to year 2000",
color = "Offense", tag = "(C)")
data(Cars93, package = "MASS")
The driving condition is unknown for mtcars dataset but if we assume that it would be similar to city then if we compare the distribution between the gallon per 100 miles for the two datasets, there is a small descrease in the gallon needed to drive 100 miles for cars in the Cars93 dataset than those in the mtcars dataset. This could be because the car models are from 1993 for the former and 1974 for the latter and technological advances made the petrol consumption more efficient in cars.
df <- bind_rows(tibble(mpg = mtcars$mpg,
year = 1974,
loc = "unknown"),
tibble(mpg = Cars93$MPG.city,
year = 1993,
loc = "city"),
tibble(mpg = Cars93$MPG.highway,
year = 1993,
loc = "highway")) %>%
mutate(gp100m = 1 / (100 * mpg)) %>%
mutate(loc = fct_reorder(loc, gp100m),
year = factor(year))
ggplot(df, aes(loc, gp100m)) +
geom_violin(aes(fill = year)) +
geom_boxplot(width = 0.1) +
labs(x = "Driving condition", y = "Gallon per 100 miles",
fill = "Year") +
scale_fill_discrete_qualitative()
Below I use the default var.equal = FALSE as the spread appears to be slightly larger in 1974.
with(df,
t.test(gp100m[loc=="city"],
gp100m[loc=="unknown"]))
##
## Welch Two Sample t-test
##
## data: gp100m[loc == "city"] and gp100m[loc == "unknown"]
## t = -2.356, df = 38.608, p-value = 0.02365
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1.344810e-04 -1.021431e-05
## sample estimates:
## mean of x mean of y
## 0.0004699247 0.0005422724
MPG.city and 1/MPG.city against Weight for the Cars93 dataset. What conclusion do you draw and which scatter plot do you prefer?Plot (A) shows a linear trend where the petrol consumed to drive each mile increases linearly with the increase in weight of the car.
Plot (B) show a quadratic decrease in miles that car travelled per gallon for each unit increase in weight of the car.
ggplot(Cars93, aes(Weight, 1/MPG.city)) +
geom_point() +
geom_smooth() +
labs(tag = "(A)")
ggplot(Cars93, aes(Weight, MPG.city)) +
geom_point() +
geom_smooth() +
labs(tag = "(B)")
data(bank, package = "gclus")
Right and Left, measurements of the right and left edge widths of the notes, respectively. What do the distribution of the differences between these measurements for each note look like for the two groups? Are the differences significantly different from zero?Looking at the plot below, it is easy to see that the edge length on the left seems to be generally longer for the left for the genuine Swiss bank notes but appear less so for counterfeit notes. There are a couple of what appears like outliers in the difference of right and left lengths in the genuine notes.
A Wilcoxon rank sum test suggests that the mean differences are different between the two groups and this is still the case even removing the two outlying observations.
bank %>%
mutate(Status = fct_recode(as.character(Status),
genuine = '0',
counterfeit = '1')) %>%
ggplot(aes(Right - Left)) +
geom_histogram(binwidth = 0.1, color = "white") +
geom_vline(xintercept = 0, color = "red") +
facet_grid(Status ~ .)
with(mutate(bank, diff = Right - Left),
wilcox.test(diff[Status==0],
diff[Status==1]))
##
## Wilcoxon rank sum test with continuity correction
##
## data: diff[Status == 0] and diff[Status == 1]
## W = 3901.5, p-value = 0.00711
## alternative hypothesis: true location shift is not equal to 0
with(mutate(bank, diff = Right - Left) %>%
filter(diff > -1),
wilcox.test(diff[Status==0],
diff[Status==1]))
##
## Wilcoxon rank sum test with continuity correction
##
## data: diff[Status == 0] and diff[Status == 1]
## W = 3901.5, p-value = 0.01299
## alternative hypothesis: true location shift is not equal to 0
Bottom and Top for the margin widths might also be expected to be close to equal for each note. Are they and does the difference relate to the edge width differences?Genuine Swiss banknotes generally seem to have longer length for the top than bottom dimension. For the genuine note, bigger difference in margin generally implies bigger difference in edges.
df <- bank %>%
mutate(difftb = Top - Bottom,
diffrl = Right - Left) %>%
mutate(Status = fct_recode(as.character(Status),
genuine = '0',
counterfeit = '1'))
ggplot(df, aes(difftb)) +
geom_histogram(bindwidth = 0.1, color = "white") +
facet_grid(Status ~ .) +
geom_vline(xintercept = 0, color = "red") +
labs(x = "Top - Bottom")
ggplot(df, aes(difftb, diffrl)) +
facet_wrap(~Status) +
geom_vline(xintercept = 0, color = "red") +
geom_hline(yintercept = 0, color = "red") +
geom_point() +
labs(x = "Top - Bottom", y = "Right - Left")
There are four different combinations of proportionate differences in edge and margin lengths as shown in Plot (A)-(D). None of these are particularly striking to differentiate between the genuine and counterfeit banknotes. The metric used in Plot (E), engineered by Sherry Zhang, does a splendid job of segregating the two group types.
ggplot(df, aes(abs(Right - Left)/Right, abs(Top - Bottom)/Top)) +
geom_point() +
facet_wrap(~Status) +
labs(tag = "(A)")
ggplot(df, aes(abs(Right - Left)/Right, abs(Top - Bottom)/Bottom)) +
geom_point() +
facet_wrap(~Status) +
labs(tag = "(B)")
ggplot(df, aes(abs(Right - Left)/Left, abs(Top - Bottom)/Top)) +
geom_point() +
facet_wrap(~Status) +
labs(tag = "(C)")
ggplot(df, aes(abs(Right - Left)/Left, abs(Top - Bottom)/Bottom)) +
geom_point() +
facet_wrap(~Status) +
labs(tag = "(D)")
ggplot(df, aes((Right + Left)/mean(Right + Left), (Top + Bottom)/mean(Top + Bottom))) +
facet_wrap(~Status) +
geom_point() +
labs(tag = "(E)")
data(Olkin95, package = "meta")
The event rate appears to be higher in general for the control group than the experimental group.
ggplot(Olkin95, aes(event.e / n.e, event.c / n.c)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
labs(x = "Event rate in the experimental group",
y = "Event rate in the control group")
The experimental group generally appears to have a lower rate of events than the control group. This particularly seems to be the case for larger studies.
ggplot(Olkin95, aes((n.e + n.c), event.e / n.e - event.c / n.c)) +
labs(x = "Study size", y = "Difference in the rate of event (Experimental - Control)") +
scale_x_log10() +
geom_hline(yintercept = 0, color = "red") +
geom_point()